Changes the age, city category and occupation variables from integers to factors, provides the variable types and checks for any missing values, sets seed, gets rid of scientific notation
BlackFridayDF <- read.csv("/Users/kathleengendotti/Downloads/BlackFriday.csv")
BlackFridayDF$Age <- as.factor(BlackFridayDF$Age)
BlackFridayDF$City_Category <- as.factor(BlackFridayDF$City_Category)
BlackFridayDF$Occupation <- as.factor(BlackFridayDF$Occupation)
colSums(is.na(BlackFridayDF))
## User_ID Product_ID
## 0 0
## Gender Age
## 0 0
## Occupation City_Category
## 0 0
## Stay_In_Current_City_Years Marital_Status
## 0 0
## Product_Category_1 Product_Category_2
## 0 166986
## Product_Category_3 Purchase
## 373299 0
BlackFridayDF <- BlackFridayDF[!is.na(BlackFridayDF$Product_Category_1), ]
BlackFridayDF <- BlackFridayDF[!is.na(BlackFridayDF$Product_Category_2), ]
BlackFridayDF <- BlackFridayDF[!is.na(BlackFridayDF$Product_Category_3), ]
BlackFridayDF <- BlackFridayDF[!is.na(BlackFridayDF$User_ID), ]
BlackFridayDF <- BlackFridayDF[!is.na(BlackFridayDF$Product_ID), ]
colSums(is.na(BlackFridayDF))
## User_ID Product_ID
## 0 0
## Gender Age
## 0 0
## Occupation City_Category
## 0 0
## Stay_In_Current_City_Years Marital_Status
## 0 0
## Product_Category_1 Product_Category_2
## 0 0
## Product_Category_3 Purchase
## 0 0
set.seed(1861)
options(scipen=15)
Gives mean, median, 25th and 75th quartiles, min, and max of the variables
summary(BlackFridayDF)
## User_ID Product_ID Gender Age
## Min. :1000001 P00110742: 1591 F: 36932 0-17 : 4789
## 1st Qu.:1001497 P00025442: 1586 M:127346 18-25:30889
## Median :1003053 P00112142: 1539 26-35:65916
## Mean :1003000 P00057642: 1430 36-45:32758
## 3rd Qu.:1004418 P00184942: 1424 46-50:13135
## Max. :1006040 P00046742: 1417 51-55:11018
## (Other) :155291 55+ : 5773
## Occupation City_Category Stay_In_Current_City_Years Marital_Status
## 4 :22076 A:40848 0 :22061 Min. :0.0000
## 0 :20677 B:68185 1 :57297 1st Qu.:0.0000
## 7 :17542 C:55245 2 :31040 Median :0.0000
## 17 :13844 3 :28886 Mean :0.4022
## 1 :13209 4+:24994 3rd Qu.:1.0000
## 12 :10304 Max. :1.0000
## (Other):66626
## Product_Category_1 Product_Category_2 Product_Category_3 Purchase
## Min. : 1.000 Min. : 2.000 Min. : 3.00 Min. : 185
## 1st Qu.: 1.000 1st Qu.: 2.000 1st Qu.: 9.00 1st Qu.: 7871
## Median : 1.000 Median : 6.000 Median :14.00 Median :11757
## Mean : 2.742 Mean : 6.896 Mean :12.67 Mean :11661
## 3rd Qu.: 4.000 3rd Qu.:10.000 3rd Qu.:16.00 3rd Qu.:15627
## Max. :15.000 Max. :16.000 Max. :18.00 Max. :23959
##
Histogram of purchase
hist(BlackFridayDF$Purchase, col = 8, xlab = "Purchase", ylab = "Count", main = "Purchase vs. count")
This pie chart shows the distribution of total purchases by age group
library('plotly')
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
pieAge <- plot_ly(BlackFridayDF, labels = ~Age, values = ~Purchase, type = 'pie') %>%
layout(title = 'Age Group Purchases',
xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
pieAge
Pie chart of purchases by gender
pieGender <- plot_ly(BlackFridayDF, labels = ~Gender, values = ~Purchase, type = 'pie') %>%
layout(title = 'Gender Group Purchases',
xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
pieGender
These boxplots shows the distribution of purchases by gender
ggplot(aes(x=Gender, y=Purchase, fill=Gender), data = BlackFridayDF)+geom_boxplot()
Histogram plots of purchase by occupation
BlackFridayDF$Purchase <- log(BlackFridayDF$Purchase)
library("ggplot2")
ggplot(aes(Purchase), data = BlackFridayDF) + geom_histogram() + theme_light() + labs(title = "Plot of Total Purchases by Occupation", x = "Purchase (in dollars)") + facet_wrap(~Occupation)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
This plot shows the frequency of various total purchase values colored by the number of years the individual has lived in their current city.
ggplot(aes(Purchase, fill = City_Category), data = BlackFridayDF) + geom_histogram() + theme_light() + labs(title = "Plot of Total Purchase Price Frequency by Number of Years Lived in Current City", x = "Purchase (in dollars)")+ facet_wrap(~Stay_In_Current_City_Years)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
These plots show the plots of purchases separated by the city catgeories and colored in by the number of years the individual has lived in their current city.
ggplot(aes(Purchase, fill = Stay_In_Current_City_Years), data = BlackFridayDF) + geom_histogram() + theme_light() + labs(title = "Plot of Total Purchase Price Frequency by City Catgeory", subtitle = "Colored by Number of Years Lived in Current City", x = "Purchase (in dollars)") + facet_wrap(~City_Category)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Linear Regression Model
Linear_regression <- lm(formula = Purchase ~ Gender + Age + Occupation + City_Category + Stay_In_Current_City_Years + Marital_Status, data = BlackFridayDF)
summary(Linear_regression)
##
## Call:
## lm(formula = Purchase ~ Gender + Age + Occupation + City_Category +
## Stay_In_Current_City_Years + Marital_Status, data = BlackFridayDF)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.0875 -0.2661 0.1432 0.4259 1.0320
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 9.065596 0.015098 600.452
## GenderM 0.068095 0.003668 18.567
## Age18-25 0.028712 0.014328 2.004
## Age26-35 0.035215 0.014343 2.455
## Age36-45 0.043312 0.014558 2.975
## Age46-50 0.031847 0.015266 2.086
## Age51-55 0.072111 0.015463 4.663
## Age55+ 0.037087 0.016528 2.244
## Occupation1 -0.017458 0.006673 -2.616
## Occupation2 -0.010424 0.008074 -1.291
## Occupation3 0.022003 0.009435 2.332
## Occupation4 0.011901 0.006212 1.916
## Occupation5 0.041413 0.010467 3.957
## Occupation6 0.033344 0.008977 3.714
## Occupation7 0.029981 0.006176 4.855
## Occupation8 0.043157 0.026014 1.659
## Occupation9 -0.014556 0.014726 -0.988
## Occupation10 -0.031097 0.015151 -2.053
## Occupation11 0.002420 0.011274 0.215
## Occupation12 0.052555 0.007196 7.303
## Occupation13 -0.005548 0.014804 -0.375
## Occupation14 0.056893 0.007752 7.339
## Occupation15 0.045649 0.010377 4.399
## Occupation16 0.035008 0.008217 4.260
## Occupation17 0.049470 0.006593 7.503
## Occupation18 0.016311 0.014068 1.159
## Occupation19 -0.062412 0.012899 -4.839
## Occupation20 -0.025280 0.007516 -3.363
## City_CategoryB 0.027296 0.003751 7.278
## City_CategoryC 0.098321 0.003961 24.821
## Stay_In_Current_City_Years1 0.012955 0.004729 2.739
## Stay_In_Current_City_Years2 0.023914 0.005248 4.557
## Stay_In_Current_City_Years3 0.012292 0.005339 2.302
## Stay_In_Current_City_Years4+ 0.015715 0.005510 2.852
## Marital_Status -0.003435 0.003199 -1.074
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## GenderM < 0.0000000000000002 ***
## Age18-25 0.045081 *
## Age26-35 0.014079 *
## Age36-45 0.002928 **
## Age46-50 0.036969 *
## Age51-55 0.0000031132431326 ***
## Age55+ 0.024845 *
## Occupation1 0.008892 **
## Occupation2 0.196671
## Occupation3 0.019693 *
## Occupation4 0.055405 .
## Occupation5 0.0000760330307448 ***
## Occupation6 0.000204 ***
## Occupation7 0.0000012067735603 ***
## Occupation8 0.097112 .
## Occupation9 0.322943
## Occupation10 0.040122 *
## Occupation11 0.830017
## Occupation12 0.0000000000002820 ***
## Occupation13 0.707857
## Occupation14 0.0000000000002160 ***
## Occupation15 0.0000108812696285 ***
## Occupation16 0.0000204208396818 ***
## Occupation17 0.0000000000000627 ***
## Occupation18 0.246255
## Occupation19 0.0000013090411186 ***
## Occupation20 0.000770 ***
## City_CategoryB 0.0000000000003410 ***
## City_CategoryC < 0.0000000000000002 ***
## Stay_In_Current_City_Years1 0.006155 **
## Stay_In_Current_City_Years2 0.0000052037786125 ***
## Stay_In_Current_City_Years3 0.021320 *
## Stay_In_Current_City_Years4+ 0.004346 **
## Marital_Status 0.282908
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5939 on 164243 degrees of freedom
## Multiple R-squared: 0.01058, Adjusted R-squared: 0.01038
## F-statistic: 51.68 on 34 and 164243 DF, p-value: < 0.00000000000000022
Generalized Linear Model
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-16
logitFit <- glm(Purchase ~ Gender + Age + Occupation + City_Category + Stay_In_Current_City_Years + Marital_Status, data = BlackFridayDF)
summary(logitFit)
##
## Call:
## glm(formula = Purchase ~ Gender + Age + Occupation + City_Category +
## Stay_In_Current_City_Years + Marital_Status, data = BlackFridayDF)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.0875 -0.2661 0.1432 0.4259 1.0320
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 9.065596 0.015098 600.452
## GenderM 0.068095 0.003668 18.567
## Age18-25 0.028712 0.014328 2.004
## Age26-35 0.035215 0.014343 2.455
## Age36-45 0.043312 0.014558 2.975
## Age46-50 0.031847 0.015266 2.086
## Age51-55 0.072111 0.015463 4.663
## Age55+ 0.037087 0.016528 2.244
## Occupation1 -0.017458 0.006673 -2.616
## Occupation2 -0.010424 0.008074 -1.291
## Occupation3 0.022003 0.009435 2.332
## Occupation4 0.011901 0.006212 1.916
## Occupation5 0.041413 0.010467 3.957
## Occupation6 0.033344 0.008977 3.714
## Occupation7 0.029981 0.006176 4.855
## Occupation8 0.043157 0.026014 1.659
## Occupation9 -0.014556 0.014726 -0.988
## Occupation10 -0.031097 0.015151 -2.053
## Occupation11 0.002420 0.011274 0.215
## Occupation12 0.052555 0.007196 7.303
## Occupation13 -0.005548 0.014804 -0.375
## Occupation14 0.056893 0.007752 7.339
## Occupation15 0.045649 0.010377 4.399
## Occupation16 0.035008 0.008217 4.260
## Occupation17 0.049470 0.006593 7.503
## Occupation18 0.016311 0.014068 1.159
## Occupation19 -0.062412 0.012899 -4.839
## Occupation20 -0.025280 0.007516 -3.363
## City_CategoryB 0.027296 0.003751 7.278
## City_CategoryC 0.098321 0.003961 24.821
## Stay_In_Current_City_Years1 0.012955 0.004729 2.739
## Stay_In_Current_City_Years2 0.023914 0.005248 4.557
## Stay_In_Current_City_Years3 0.012292 0.005339 2.302
## Stay_In_Current_City_Years4+ 0.015715 0.005510 2.852
## Marital_Status -0.003435 0.003199 -1.074
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## GenderM < 0.0000000000000002 ***
## Age18-25 0.045081 *
## Age26-35 0.014079 *
## Age36-45 0.002928 **
## Age46-50 0.036969 *
## Age51-55 0.0000031132431326 ***
## Age55+ 0.024845 *
## Occupation1 0.008892 **
## Occupation2 0.196671
## Occupation3 0.019693 *
## Occupation4 0.055405 .
## Occupation5 0.0000760330307448 ***
## Occupation6 0.000204 ***
## Occupation7 0.0000012067735603 ***
## Occupation8 0.097112 .
## Occupation9 0.322943
## Occupation10 0.040122 *
## Occupation11 0.830017
## Occupation12 0.0000000000002820 ***
## Occupation13 0.707857
## Occupation14 0.0000000000002160 ***
## Occupation15 0.0000108812696285 ***
## Occupation16 0.0000204208396818 ***
## Occupation17 0.0000000000000627 ***
## Occupation18 0.246255
## Occupation19 0.0000013090411186 ***
## Occupation20 0.000770 ***
## City_CategoryB 0.0000000000003410 ***
## City_CategoryC < 0.0000000000000002 ***
## Stay_In_Current_City_Years1 0.006155 **
## Stay_In_Current_City_Years2 0.0000052037786125 ***
## Stay_In_Current_City_Years3 0.021320 *
## Stay_In_Current_City_Years4+ 0.004346 **
## Marital_Status 0.282908
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.3527084)
##
## Null deviance: 58550 on 164277 degrees of freedom
## Residual deviance: 57930 on 164243 degrees of freedom
## AIC: 295041
##
## Number of Fisher Scoring iterations: 2
BlackFridayDF$PurchaseHigh <- ifelse(BlackFridayDF$Purchase > median(BlackFridayDF$Purchase),1,0)
doBy::summaryBy(PurchaseHigh ~ Gender + Age + Occupation + City_Category + Stay_In_Current_City_Years + Marital_Status, data = BlackFridayDF)
tFit <- glm(PurchaseHigh ~ Gender + Age + Occupation + City_Category + Stay_In_Current_City_Years + Marital_Status, data = BlackFridayDF, family = binomial)
summary(tFit)
##
## Call:
## glm(formula = PurchaseHigh ~ Gender + Age + Occupation + City_Category +
## Stay_In_Current_City_Years + Marital_Status, family = binomial,
## data = BlackFridayDF)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3826 -1.1720 -0.9489 1.1668 1.4245
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -0.513669 0.051163 -10.040
## GenderM 0.236585 0.012436 19.024
## Age18-25 0.088867 0.048522 1.831
## Age26-35 0.079288 0.048578 1.632
## Age36-45 0.095505 0.049305 1.937
## Age46-50 0.059914 0.051705 1.159
## Age51-55 0.175756 0.052373 3.356
## Age55+ 0.045900 0.055962 0.820
## Occupation1 -0.062909 0.022599 -2.784
## Occupation2 -0.031010 0.027329 -1.135
## Occupation3 0.074070 0.031897 2.322
## Occupation4 0.063171 0.021000 3.008
## Occupation5 0.099729 0.035372 2.819
## Occupation6 0.115319 0.030344 3.800
## Occupation7 0.124448 0.020887 5.958
## Occupation8 0.115615 0.087946 1.315
## Occupation9 -0.079018 0.050148 -1.576
## Occupation10 -0.050735 0.051285 -0.989
## Occupation11 0.030601 0.038076 0.804
## Occupation12 0.190125 0.024361 7.805
## Occupation13 -0.112523 0.050038 -2.249
## Occupation14 0.176812 0.026232 6.740
## Occupation15 0.130918 0.035086 3.731
## Occupation16 0.096347 0.027770 3.469
## Occupation17 0.173334 0.022322 7.765
## Occupation18 0.050993 0.047540 1.073
## Occupation19 -0.111961 0.043750 -2.559
## Occupation20 -0.084326 0.025462 -3.312
## City_CategoryB 0.087646 0.012687 6.908
## City_CategoryC 0.315020 0.013417 23.478
## Stay_In_Current_City_Years1 0.045273 0.016007 2.828
## Stay_In_Current_City_Years2 0.089705 0.017767 5.049
## Stay_In_Current_City_Years3 0.050393 0.018076 2.788
## Stay_In_Current_City_Years4+ 0.037964 0.018646 2.036
## Marital_Status -0.009841 0.010828 -0.909
## Pr(>|z|)
## (Intercept) < 0.0000000000000002 ***
## GenderM < 0.0000000000000002 ***
## Age18-25 0.067026 .
## Age26-35 0.102643
## Age36-45 0.052744 .
## Age46-50 0.246550
## Age51-55 0.000791 ***
## Age55+ 0.412103
## Occupation1 0.005375 **
## Occupation2 0.256517
## Occupation3 0.020223 *
## Occupation4 0.002628 **
## Occupation5 0.004811 **
## Occupation6 0.000144 ***
## Occupation7 0.00000000255093537 ***
## Occupation8 0.188639
## Occupation9 0.115098
## Occupation10 0.322524
## Occupation11 0.421589
## Occupation12 0.00000000000000597 ***
## Occupation13 0.024528 *
## Occupation14 0.00000000001579251 ***
## Occupation15 0.000190 ***
## Occupation16 0.000521 ***
## Occupation17 0.00000000000000816 ***
## Occupation18 0.283430
## Occupation19 0.010495 *
## Occupation20 0.000927 ***
## City_CategoryB 0.00000000000491282 ***
## City_CategoryC < 0.0000000000000002 ***
## Stay_In_Current_City_Years1 0.004679 **
## Stay_In_Current_City_Years2 0.00000044386005160 ***
## Stay_In_Current_City_Years3 0.005305 **
## Stay_In_Current_City_Years4+ 0.041752 *
## Marital_Status 0.363414
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 227738 on 164277 degrees of freedom
## Residual deviance: 226132 on 164243 degrees of freedom
## AIC: 226202
##
## Number of Fisher Scoring iterations: 4
Lasso vs. Ridge Models
myFormula <- as.formula(Purchase ~ Gender + Age + Occupation + City_Category + Stay_In_Current_City_Years + Marital_Status)
library("useful")
Xvar <- build.x(myFormula, data=BlackFridayDF)
Yvar <- build.y(myFormula, data=BlackFridayDF)
library("glmnet")
LassoMod <- cv.glmnet(x = Xvar, y = Yvar, alpha = 1, nfolds=10)
RidgeMod <- cv.glmnet(x = Xvar, y = Yvar, alpha = 0, nfolds=10)
Lassomin = coef(LassoMod, s = "lambda.min")
Lasso1se = coef(LassoMod, s = "lambda.1se")
Ridgemin = coef(RidgeMod, s = "lambda.min")
Ridge1se = coef(RidgeMod, s = "lambda.1se")
Lassomin = round(Lassomin, digits = 4)
Ridgemin = round(Ridgemin, digits = 4)
Lasso1se = round(Lasso1se, digits = 4)
Ridge1se = round(Ridge1se, digits = 4)
table1 = cbind(Lassomin,Ridgemin)
colnames(table1) = c("Lasso", "Ridge")
print(table1)
## 36 x 2 sparse Matrix of class "dgCMatrix"
## Lasso Ridge
## (Intercept) 9.0728 9.0857
## (Intercept) . .
## GenderM 0.0682 0.0678
## Age18-25 0.0214 0.0106
## Age26-35 0.0278 0.0167
## Age36-45 0.0359 0.0249
## Age46-50 0.0244 0.0134
## Age51-55 0.0647 0.0534
## Age55+ 0.0297 0.0187
## Occupation1 -0.0173 -0.0179
## Occupation2 -0.0103 -0.0109
## Occupation3 0.0222 0.0214
## Occupation4 0.0120 0.0113
## Occupation5 0.0416 0.0409
## Occupation6 0.0335 0.0329
## Occupation7 0.0301 0.0295
## Occupation8 0.0430 0.0421
## Occupation9 -0.0143 -0.0149
## Occupation10 -0.0370 -0.0462
## Occupation11 0.0026 0.0022
## Occupation12 0.0527 0.0519
## Occupation13 -0.0053 -0.0055
## Occupation14 0.0570 0.0561
## Occupation15 0.0458 0.0451
## Occupation16 0.0352 0.0345
## Occupation17 0.0496 0.0490
## Occupation18 0.0164 0.0160
## Occupation19 -0.0629 -0.0639
## Occupation20 -0.0251 -0.0256
## City_CategoryB 0.0272 0.0263
## City_CategoryC 0.0983 0.0969
## Stay_In_Current_City_Years1 0.0130 0.0125
## Stay_In_Current_City_Years2 0.0239 0.0234
## Stay_In_Current_City_Years3 0.0123 0.0118
## Stay_In_Current_City_Years4+ 0.0157 0.0153
## Marital_Status -0.0033 -0.0030
table2 = cbind(Lasso1se, Ridge1se)
colnames(table2) = c("Lasso", "Ridge")
print(table2)
## 36 x 2 sparse Matrix of class "dgCMatrix"
## Lasso Ridge
## (Intercept) 9.1970 9.2128
## (Intercept) . .
## GenderM 0.0240 0.0128
## Age18-25 . -0.0022
## Age26-35 . -0.0008
## Age36-45 . 0.0023
## Age46-50 . -0.0001
## Age51-55 . 0.0072
## Age55+ . 0.0022
## Occupation1 . -0.0070
## Occupation2 . -0.0065
## Occupation3 . -0.0011
## Occupation4 . -0.0022
## Occupation5 . 0.0052
## Occupation6 . 0.0025
## Occupation7 . 0.0043
## Occupation8 . 0.0061
## Occupation9 . -0.0106
## Occupation10 . -0.0116
## Occupation11 . -0.0004
## Occupation12 . 0.0083
## Occupation13 . 0.0021
## Occupation14 . 0.0073
## Occupation15 . 0.0061
## Occupation16 . 0.0050
## Occupation17 . 0.0094
## Occupation18 . 0.0035
## Occupation19 . -0.0138
## Occupation20 . -0.0089
## City_CategoryB . -0.0035
## City_CategoryC 0.0355 0.0136
## Stay_In_Current_City_Years1 . 0.0000
## Stay_In_Current_City_Years2 . 0.0023
## Stay_In_Current_City_Years3 . -0.0003
## Stay_In_Current_City_Years4+ . 0.0010
## Marital_Status . 0.0009
r2Lassomin <- LassoMod$glmnet.fit$dev.ratio[which(LassoMod$glmnet.fit$lambda == LassoMod$lambda.min)]
r2Lasso1se <- LassoMod$glmnet.fit$dev.ratio[which(LassoMod$glmnet.fit$lambda == LassoMod$lambda.1se)]
r2Lassomin
## [1] 0.0105828
r2Lasso1se
## [1] 0.004431786
r2Ridgemin <- RidgeMod$glmnet.fit$dev.ratio[which(RidgeMod$glmnet.fit$lambda == RidgeMod$lambda.min)]
r2Ridge1se <- RidgeMod$glmnet.fit$dev.ratio[which(RidgeMod$glmnet.fit$lambda == RidgeMod$lambda.1se)]
r2Ridgemin
## [1] 0.01057293
r2Ridge1se
## [1] 0.00356048
Forward Stepwise Model
library("leaps")
#was not sure on good value to set for nvmax so went with the default of 8
BFFitFwd <- regsubsets(Purchase ~ Gender + Age + Occupation + City_Category + Stay_In_Current_City_Years + Marital_Status , data = BlackFridayDF, nvmax = 8, method = "forward")
summary(BFFitFwd)
## Subset selection object
## Call: regsubsets.formula(Purchase ~ Gender + Age + Occupation + City_Category +
## Stay_In_Current_City_Years + Marital_Status, data = BlackFridayDF,
## nvmax = 8, method = "forward")
## 34 Variables (and intercept)
## Forced in Forced out
## GenderM FALSE FALSE
## Age18-25 FALSE FALSE
## Age26-35 FALSE FALSE
## Age36-45 FALSE FALSE
## Age46-50 FALSE FALSE
## Age51-55 FALSE FALSE
## Age55+ FALSE FALSE
## Occupation1 FALSE FALSE
## Occupation2 FALSE FALSE
## Occupation3 FALSE FALSE
## Occupation4 FALSE FALSE
## Occupation5 FALSE FALSE
## Occupation6 FALSE FALSE
## Occupation7 FALSE FALSE
## Occupation8 FALSE FALSE
## Occupation9 FALSE FALSE
## Occupation10 FALSE FALSE
## Occupation11 FALSE FALSE
## Occupation12 FALSE FALSE
## Occupation13 FALSE FALSE
## Occupation14 FALSE FALSE
## Occupation15 FALSE FALSE
## Occupation16 FALSE FALSE
## Occupation17 FALSE FALSE
## Occupation18 FALSE FALSE
## Occupation19 FALSE FALSE
## Occupation20 FALSE FALSE
## City_CategoryB FALSE FALSE
## City_CategoryC FALSE FALSE
## Stay_In_Current_City_Years1 FALSE FALSE
## Stay_In_Current_City_Years2 FALSE FALSE
## Stay_In_Current_City_Years3 FALSE FALSE
## Stay_In_Current_City_Years4+ FALSE FALSE
## Marital_Status FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: forward
## GenderM Age18-25 Age26-35 Age36-45 Age46-50 Age51-55 Age55+
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) "*" " " " " " " " " " " " "
## 3 ( 1 ) "*" " " " " " " " " " " " "
## 4 ( 1 ) "*" " " " " " " " " " " " "
## 5 ( 1 ) "*" " " " " " " " " " " " "
## 6 ( 1 ) "*" " " " " " " " " " " " "
## 7 ( 1 ) "*" " " " " " " " " " " " "
## 8 ( 1 ) "*" " " " " " " " " "*" " "
## Occupation1 Occupation2 Occupation3 Occupation4 Occupation5
## 1 ( 1 ) " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " "
## 4 ( 1 ) " " " " " " " " " "
## 5 ( 1 ) " " " " " " " " " "
## 6 ( 1 ) " " " " " " " " " "
## 7 ( 1 ) "*" " " " " " " " "
## 8 ( 1 ) "*" " " " " " " " "
## Occupation6 Occupation7 Occupation8 Occupation9 Occupation10
## 1 ( 1 ) " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " "*"
## 4 ( 1 ) " " " " " " " " "*"
## 5 ( 1 ) " " " " " " " " "*"
## 6 ( 1 ) " " " " " " " " "*"
## 7 ( 1 ) " " " " " " " " "*"
## 8 ( 1 ) " " " " " " " " "*"
## Occupation11 Occupation12 Occupation13 Occupation14 Occupation15
## 1 ( 1 ) " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " "
## 4 ( 1 ) " " " " " " " " " "
## 5 ( 1 ) " " " " " " " " " "
## 6 ( 1 ) " " " " " " " " " "
## 7 ( 1 ) " " " " " " " " " "
## 8 ( 1 ) " " " " " " " " " "
## Occupation16 Occupation17 Occupation18 Occupation19 Occupation20
## 1 ( 1 ) " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " "
## 4 ( 1 ) " " " " " " " " " "
## 5 ( 1 ) " " " " " " "*" " "
## 6 ( 1 ) " " " " " " "*" "*"
## 7 ( 1 ) " " " " " " "*" "*"
## 8 ( 1 ) " " " " " " "*" "*"
## City_CategoryB City_CategoryC Stay_In_Current_City_Years1
## 1 ( 1 ) " " "*" " "
## 2 ( 1 ) " " "*" " "
## 3 ( 1 ) " " "*" " "
## 4 ( 1 ) "*" "*" " "
## 5 ( 1 ) "*" "*" " "
## 6 ( 1 ) "*" "*" " "
## 7 ( 1 ) "*" "*" " "
## 8 ( 1 ) "*" "*" " "
## Stay_In_Current_City_Years2 Stay_In_Current_City_Years3
## 1 ( 1 ) " " " "
## 2 ( 1 ) " " " "
## 3 ( 1 ) " " " "
## 4 ( 1 ) " " " "
## 5 ( 1 ) " " " "
## 6 ( 1 ) " " " "
## 7 ( 1 ) " " " "
## 8 ( 1 ) " " " "
## Stay_In_Current_City_Years4+ Marital_Status
## 1 ( 1 ) " " " "
## 2 ( 1 ) " " " "
## 3 ( 1 ) " " " "
## 4 ( 1 ) " " " "
## 5 ( 1 ) " " " "
## 6 ( 1 ) " " " "
## 7 ( 1 ) " " " "
## 8 ( 1 ) " " " "
#created separate variables for city catrgories B and C since the forward stepwise model chose them
BlackFridayDF$City_CategoryB <- ifelse(BlackFridayDF$City_Category == "B",1,0)
BlackFridayDF$City_CategoryC <- ifelse(BlackFridayDF$City_Category == "C",1,0)
#created separate variable for the age group of people between 51 and 55
BlackFridayDF$Age51 <- ifelse(BlackFridayDF$Age == "51-55",1,0)
#created separate variables for the occupations the model chose
BlackFridayDF$Occupation1 <- ifelse(BlackFridayDF$Occupation == "1",1,0)
BlackFridayDF$Occupation10 <- ifelse(BlackFridayDF$Occupation == "10",1,0)
BlackFridayDF$Occupation19 <- ifelse(BlackFridayDF$Occupation == "19",1,0)
BlackFridayDF$Occupation20 <- ifelse(BlackFridayDF$Occupation == "20",1,0)
lmDayDF <- lm(Purchase ~ Gender + City_CategoryB + City_CategoryC + Age51 + Occupation1 + Occupation10 + Occupation19 + Occupation20, data = BlackFridayDF)
summary(lmDayDF)
##
## Call:
## lm(formula = Purchase ~ Gender + City_CategoryB + City_CategoryC +
## Age51 + Occupation1 + Occupation10 + Occupation19 + Occupation20,
## data = BlackFridayDF)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.0752 -0.2652 0.1459 0.4273 1.0323
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.131105 0.004144 2203.393 < 0.0000000000000002 ***
## GenderM 0.074262 0.003529 21.044 < 0.0000000000000002 ***
## City_CategoryB 0.028850 0.003726 7.744 0.00000000000000971 ***
## City_CategoryC 0.100927 0.003900 25.878 < 0.0000000000000002 ***
## Age51 0.036187 0.005883 6.151 0.00000000077334193 ***
## Occupation1 -0.040556 0.005442 -7.452 0.00000000000009254 ***
## Occupation10 -0.083945 0.009270 -9.055 < 0.0000000000000002 ***
## Occupation19 -0.091420 0.012247 -7.464 0.00000000000008412 ***
## Occupation20 -0.048856 0.006463 -7.560 0.00000000000004057 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5942 on 164269 degrees of freedom
## Multiple R-squared: 0.009239, Adjusted R-squared: 0.00919
## F-statistic: 191.5 on 8 and 164269 DF, p-value: < 0.00000000000000022
Calculates RMSE and MSE over all the models
RMSE = function(x1,x2) {
sqrt(mean(x1-x2)^2)
}
lmRMSE = RMSE(predict(Linear_regression, newx = Xvar), BlackFridayDF$Purchase)
ltRMSE = RMSE(predict(logitFit, newx = Xvar), BlackFridayDF$Purchase)
LassoRMSE = RMSE(predict(LassoMod, newx = Xvar), BlackFridayDF$Purchase)
RidgeRMSE = RMSE(predict(RidgeMod, newx = Xvar), BlackFridayDF$Purchase)
lmDayDFRMSE = RMSE(predict(lmDayDF, newx = Xvar), BlackFridayDF$Purchase)
MSE <- function(yhat, ytrue){
mean((yhat - ytrue)^2)
}
lmMSE = MSE(predict(Linear_regression, newx = Xvar), BlackFridayDF$Purchase)
ltMSE = MSE(predict(logitFit, newx = Xvar), BlackFridayDF$Purchase)
LassoMSE = MSE(predict(LassoMod, newx = Xvar), BlackFridayDF$Purchase)
RidgeMSE = MSE(predict(RidgeMod, newx = Xvar), BlackFridayDF$Purchase)
lmDayDFMSE = RMSE(predict(lmDayDF, newx = Xvar), BlackFridayDF$Purchase)
compareModels <- matrix(c(lmRMSE,ltRMSE, LassoRMSE,RidgeRMSE, lmDayDFRMSE, lmMSE, ltMSE, LassoMSE,RidgeMSE, lmDayDFMSE),ncol=2,byrow=TRUE)
colnames(compareModels) <- c("RMSE","MSE")
rownames(compareModels) <- c("Linear Regression", "Generalized Linear","Lasso","Ridge", "Forward Stepwise")
compareModels <- as.table(compareModels)
compareModels
## RMSE MSE
## Linear Regression 0.000000000000004791599 0.000000000000004791599
## Generalized Linear 0.000000000000062269982 0.000000000000066568344
## Lasso 0.000000000000003284116 0.352633274631699078494
## Ridge 0.352633274631699078494 0.354826116309063299692
## Forward Stepwise 0.355136654667222717574 0.000000000000003284116